# Import all packages used below
import time
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from pandas.plotting import scatter_matrix
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate, cross_val_score
def pca_and_visualize(input_file, output_file, unit_index, label):
    """
    Reduce the feature columns to 10 principal components, save and plot them.

    Rows whose label equals 2 are discarded. Every column other than
    ``unit_index`` and ``label`` is treated as a numeric feature.

    Side effects: writes ``output_file`` (unit index, PC1..PC10, label) and
    the images 'PCA10_scatter_plot.png', 'PCA10_scatter_matrix.png' and
    'PCA10_heatmap.png' in the working directory, shows the three figures,
    and prints the explained-variance sums and the cross-validation result.

    :param input_file: str, input data file path
    :param output_file: str, output file path
    :param unit_index: str, column name of unit index
    :param label: str, column name of label
    """
    n_components = 10
    # Read the raw data file
    df = pd.read_csv(input_file)
    # Discard the rows labelled 2, using the caller-supplied label column
    df = df[df[label] != 2]
    # Feature matrix: everything except the unit index and the label,
    # selected by name so the column order of the file does not matter
    features = df.drop([unit_index, label], axis=1).values
    # Perform PCA dimensionality reduction on the feature columns
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(features)
    # Build the result frame. The filtered df no longer has a 0..n-1 index,
    # so the id and label columns are attached positionally (.values) to
    # avoid index misalignment with the freshly created PCA frame.
    pc_names = ['PC{}'.format(i) for i in range(1, n_components + 1)]
    df_new = pd.DataFrame(pca_result, columns=pc_names)
    df_new.insert(0, unit_index, df[unit_index].values)
    df_new[label] = df[label].values
    # Save PCA result to CSV file
    df_new.to_csv(output_file, index=False)

    # Draw scatter plot (PC1, PC2, PC3), one colour per label value
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(projection='3d')
    for name, group in df_new.groupby(label):
        ax.scatter(group['PC1'], group['PC2'], group['PC3'], label=name)
    ax.legend()
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_zlabel('PC3')
    ax.set_title('PCA')
    fig.subplots_adjust(left=0.1)  # Adjust left margin whitespace
    # Save before show(): the figure may be released once it has been shown
    fig.savefig('PCA10_scatter_plot.png')
    plt.show()

    # Draw scatter matrix (all columns except the label, coloured by label)
    scatter_matrix(df_new.iloc[:, :-1], c=df_new[label], figsize=(20, 20), marker='o')
    # scatter_matrix created a new current figure; save that one
    plt.savefig('PCA10_scatter_matrix.png')
    plt.show()

    # Draw correlation heatmap
    plt.figure(figsize=(15, 15))
    ax = plt.subplot()
    sns.heatmap(df_new.corr(), annot=True, ax=ax)
    plt.savefig('PCA10_heatmap.png')
    plt.show()

    # Report the cumulative explained variance. The ratios are sorted in
    # decreasing order, so the first k entries of the 10-component fit give
    # the contribution of the first k components without refitting.
    ratios = pca.explained_variance_ratio_
    print('Variance contribution rates (10):',
          sum(ratios))
    for k in (2, 3, 6):
        print('Variance contribution rates ({}):'.format(k),
              sum(ratios[:k]))
    # Cross-validate the PCA model on the feature matrix only
    # (the id and label columns must not be part of the input)
    print('Cross validate scores:',
          cross_validate(pca, features, cv=10))
# Entry point: run the PCA step and report how long it took.
if __name__ == '__main__':
    started = time.time()
    pca_and_visualize(
        input_file='Data.csv',
        output_file='PCA10_data.csv',
        unit_index='Patient index',
        label='Label',
    )
    elapsed = time.time() - started
    print('Execution time: {:.2f}s'.format(elapsed))
Variance contribution rates (10): 0.8294108531299192 Variance contribution rates (2): 0.32479715122708375 Variance contribution rates (3): 0.41037516493120607 Variance contribution rates (6): 0.611955410437872 Cross validate scores: {'fit_time': array([0.00897574, 0.00897503, 0.00797915, 0.0079782 , 0.00798011, 0.00897431, 0.00897479, 0.00797939, 0.0079782 , 0.00797892]), 'score_time': array([0.00199556, 0.00099754, 0.0009973 , 0.00099778, 0.00099683, 0.00099897, 0.0009973 , 0.00099707, 0.00199604, 0.00099707]), 'test_score': array([-19.9893636 , -18.25827548, -17.671087 , -17.41411827, -17.77187038, -18.07353759, -18.5401708 , -18.59057024, -18.80471495, -19.59787247])} Execution time: 9.64s
def knn_classification(input_file, output_file, unit_index, label, k_neighbors):
    """
    Fit a k-nearest-neighbours classifier, export its predictions and report metrics.

    Every column other than ``unit_index`` and ``label`` is used as a feature;
    the input must contain 'PC1' and 'PC2' columns for the scatter plot.
    Accuracy, precision, recall and F1 treat label 1 as positive and 0 as
    negative and are computed on the same rows the model was fitted on; the
    10-fold cross-validation result is printed separately.

    Side effects: writes ``output_file`` (unit index, label, 'Predicted Label'),
    shows a scatter plot and a confusion-matrix heatmap, prints the metrics.

    :param input_file: str, input data file path
    :param output_file: str, output file path
    :param unit_index: str, column name of unit index
    :param label: str, column name of label
    :param k_neighbors: int, k neighbors nearby
    """
    # Read the input file
    df = pd.read_csv(input_file)
    # Feature columns: everything except the unit index and the label
    df_features = df.drop([unit_index, label], axis=1)
    # Copy so that adding the prediction column below does not write into a
    # slice of df (this is what raised SettingWithCopyWarning)
    df_labels = df[[unit_index, label]].copy()
    # Fit KNN model
    knn = KNeighborsClassifier(n_neighbors=k_neighbors)
    knn.fit(df_features, df_labels[label])
    # Predict labels
    df_labels['Predicted Label'] = knn.predict(df_features)
    # Export the data frame with predicted labels as output file
    df_labels.to_csv(output_file, index=False)

    # Count the four binary outcomes (1 = positive, 0 = negative)
    actual = df_labels[label]
    predicted = df_labels['Predicted Label']
    tp = ((predicted == 1) & (actual == 1)).sum()
    tn = ((predicted == 0) & (actual == 0)).sum()
    fp = ((predicted == 1) & (actual == 0)).sum()
    fn = ((predicted == 0) & (actual == 1)).sum()
    # Compute accuracy, precision, recall and F1; a metric whose denominator
    # is zero (e.g. the positive class is never predicted) is reported as 0.0
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        F1_score = 2 * (precision * recall / (precision + recall))
    else:
        F1_score = 0.0

    # Draw KNN scatter plot with predicted labels. A single subplots() call
    # creates the figure at the intended size without a stray empty figure.
    fig, ax = plt.subplots(figsize=(10, 8))
    for name, group in df_labels.groupby('Predicted Label'):
        ax.scatter(df.loc[group.index, 'PC1'], df.loc[group.index, 'PC2'], label=name)
    ax.legend()
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_title('KNN Classification')
    plt.show()

    # Compute confusion matrix
    cm = confusion_matrix(actual, predicted)
    # Plot confusion matrix using seaborn heatmap; fmt='d' prints whole counts
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    # Print out the accuracy, precision, recall, F1 score of KNN classifier model
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', F1_score)
    # Evaluate the KNN model with 10-fold cross-validation
    scores = cross_validate(knn, df_features, df_labels[label], cv=10)
    print("Cross validate scores:", scores)
# Entry point: run the KNN classification step and report how long it took.
if __name__ == '__main__':
    started = time.time()
    knn_classification(
        input_file='PCA10_data.csv',
        output_file='new_knn_data.csv',
        unit_index='Patient index',
        label='Label',
        k_neighbors=47,
    )
    elapsed = time.time() - started
    print('Execution time: {:.2f}s'.format(elapsed))
C:\Users\Scort\AppData\Local\Temp\ipykernel_30896\1958426232.py:25: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_labels['Predicted Label'] = predicted_labels
<Figure size 1000x800 with 0 Axes>
Accuracy: 0.6243416102332581 Precision: 0.6045510455104551 Recall: 0.4206247325631151 F1 score: 0.49608882159979817 Cross validate scores: {'fit_time': array([0.00997424, 0.01196909, 0.00897527, 0.00897622, 0.00897574, 0.00897527, 0.00897551, 0.00897598, 0.01097059, 0.00997329]), 'score_time': array([0.06482625, 0.05385661, 0.05526328, 0.05485344, 0.05186129, 0.05385709, 0.05585003, 0.0548532 , 0.05086446, 0.05485249]), 'test_score': array([0.54887218, 0.57330827, 0.54135338, 0.58458647, 0.54699248, 0.67481203, 0.68926554, 0.63653484, 0.48964218, 0.47269303])} Execution time: 1.62s
def svm_classification(input_file, output_file, unit_index, label, C=1.0, kernel='linear', gamma='scale'):
    """
    Fit a support-vector classifier, export its predictions and report metrics.

    Every column other than ``unit_index`` and ``label`` is used as a feature;
    the input must contain 'PC1' and 'PC2' columns for the scatter plot.
    Accuracy, precision, recall and F1 treat label 1 as positive and 0 as
    negative and are computed on the same rows the model was fitted on; the
    10-fold cross-validation result is printed separately.

    Side effects: writes ``output_file`` (unit index, label, 'Predicted Label'),
    shows a scatter plot and a confusion-matrix heatmap, prints the metrics.

    :param input_file: str, input data file path
    :param output_file: str, output file path
    :param unit_index: str, column name of unit index
    :param label: str, column name of label
    :param C: float, degree of punishment for controlling classification errors
    :param kernel: str, kernel function forwarded to SVC
    :param gamma: {'scale', 'auto'} or float, kernel coefficient forwarded to SVC
    """
    # Read input data file
    df = pd.read_csv(input_file)
    # Feature columns: everything except the unit index and the label
    df_features = df.drop([unit_index, label], axis=1)
    # Copy so that adding the prediction column below does not write into a
    # slice of df (this is what raised SettingWithCopyWarning)
    df_labels = df[[unit_index, label]].copy()
    # Fit SVM model
    svm = SVC(C=C, kernel=kernel, gamma=gamma)
    svm.fit(df_features, df_labels[label])
    # Predict labels
    df_labels['Predicted Label'] = svm.predict(df_features)
    # Export the data frame with predicted labels as output file
    df_labels.to_csv(output_file, index=False)

    # Count the four binary outcomes (1 = positive, 0 = negative)
    actual = df_labels[label]
    predicted = df_labels['Predicted Label']
    tp = ((predicted == 1) & (actual == 1)).sum()
    tn = ((predicted == 0) & (actual == 0)).sum()
    fp = ((predicted == 1) & (actual == 0)).sum()
    fn = ((predicted == 0) & (actual == 1)).sum()
    # Compute accuracy, precision, recall and F1; a metric whose denominator
    # is zero (e.g. the positive class is never predicted) is reported as 0.0
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        F1_score = 2 * (precision * recall / (precision + recall))
    else:
        F1_score = 0.0

    # Draw SVM scatter plot with predicted labels. A single subplots() call
    # creates the figure at the intended size without a stray empty figure.
    fig, ax = plt.subplots(figsize=(10, 8))
    for name, group in df_labels.groupby('Predicted Label'):
        ax.scatter(df.loc[group.index, 'PC1'], df.loc[group.index, 'PC2'], label=name)
    ax.legend()
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_title('SVM Classification')
    plt.show()

    # Compute confusion matrix
    cm = confusion_matrix(actual, predicted)
    # Plot confusion matrix using seaborn heatmap; fmt='d' prints whole counts
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    # Print out the accuracy, precision, recall, F1 score of SVM classifier model
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', F1_score)
    # Evaluate the SVM model with 10-fold cross-validation
    scores = cross_validate(svm, df_features, df_labels[label], cv=10)
    print("Cross validate scores:", scores)
# Entry point: run the SVM classification step and report how long it took.
if __name__ == "__main__":
    started = time.time()
    svm_classification(
        input_file='PCA10_data.csv',
        output_file='new_svm_data.csv',
        unit_index='Patient index',
        label='Label',
        C=1.0,
        kernel='linear',
        gamma='scale',
    )
    elapsed = time.time() - started
    print('Execution time: {:.2f}s'.format(elapsed))
C:\Users\Scort\AppData\Local\Temp\ipykernel_30896\1008353979.py:26: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_labels['Predicted Label'] = svm.predict(df_features)
<Figure size 1000x800 with 0 Axes>
Accuracy: 0.6059066967644846 Precision: 0.5735115431348724 Recall: 0.40393667094565683 F1 score: 0.4740145618880241 Cross validate scores: {'fit_time': array([0.71421552, 0.67376304, 0.75099015, 0.68760085, 0.68915629, 0.68284726, 0.55757976, 0.66826916, 0.70177937, 0.69170809]), 'score_time': array([0.0339098 , 0.03590274, 0.03191471, 0.03291202, 0.03291225, 0.03391051, 0.03586912, 0.03391004, 0.03287673, 0.03490663]), 'test_score': array([0.54699248, 0.55451128, 0.53007519, 0.58458647, 0.54135338, 0.72932331, 0.56120527, 0.72693032, 0.47457627, 0.46892655])} Execution time: 8.76s
def dt_classification(input_file, output_file, unit_index, label, max_depth, min_samples_leaf,
                      min_samples_split,
                      max_features,
                      random_state=None):
    """
    Fit a decision-tree classifier, export its predictions and report metrics.

    Every column other than ``unit_index`` and ``label`` is used as a feature;
    the input must contain 'PC1' and 'PC2' columns for the scatter plot.
    Accuracy, precision, recall and F1 treat label 1 as positive and 0 as
    negative and are computed on the same rows the model was fitted on; the
    10-fold cross-validation result is printed separately.

    Side effects: writes ``output_file`` (unit index, label, 'Predicted Label'),
    shows a scatter plot and a confusion-matrix heatmap, prints the metrics.

    :param input_file: str, input data file path
    :param output_file: str, output file path
    :param unit_index: str, column name of unit index
    :param label: str, column name of label
    :param max_depth: int, maximum depth of the tree
    :param min_samples_leaf: int, minimum number of samples for a leaf node
    :param min_samples_split: int, minimum number of samples needed to split an internal node
    :param max_features: int, float or str such as 'sqrt', maximum number of features
        considered at each node split
    :param random_state: int or None, seed forwarded to DecisionTreeClassifier; pass an
        int for reproducible trees (the default None keeps the previous unseeded behaviour)
    """
    # Read input data file
    df = pd.read_csv(input_file)
    # Feature columns: everything except the unit index and the label
    df_features = df.drop([unit_index, label], axis=1)
    # Copy so that adding the prediction column below does not write into a
    # slice of df (this is what raised SettingWithCopyWarning)
    df_labels = df[[unit_index, label]].copy()
    # Fit DT model
    dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf,
                                min_samples_split=min_samples_split, max_features=max_features,
                                random_state=random_state)
    dt.fit(df_features, df_labels[label])
    # Predict labels
    df_labels['Predicted Label'] = dt.predict(df_features)
    # Export the data frame with predicted labels
    df_labels.to_csv(output_file, index=False)

    # Count the four binary outcomes (1 = positive, 0 = negative)
    actual = df_labels[label]
    predicted = df_labels['Predicted Label']
    tp = ((predicted == 1) & (actual == 1)).sum()
    tn = ((predicted == 0) & (actual == 0)).sum()
    fp = ((predicted == 1) & (actual == 0)).sum()
    fn = ((predicted == 0) & (actual == 1)).sum()
    # Compute accuracy, precision, recall and F1; a metric whose denominator
    # is zero (e.g. the positive class is never predicted) is reported as 0.0
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        F1_score = 2 * (precision * recall / (precision + recall))
    else:
        F1_score = 0.0

    # Draw DT scatter plot with predicted labels. A single subplots() call
    # creates the figure at the intended size without a stray empty figure.
    fig, ax = plt.subplots(figsize=(10, 8))
    for name, group in df_labels.groupby('Predicted Label'):
        ax.scatter(df.loc[group.index, 'PC1'], df.loc[group.index, 'PC2'], label=name)
    ax.legend()
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_title('DT Classification')
    plt.show()

    # Compute confusion matrix
    cm = confusion_matrix(actual, predicted)
    # Plot confusion matrix using seaborn heatmap; fmt='d' prints whole counts
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    # Print out the accuracy, precision, recall, F1 score of DT classifier model
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', F1_score)
    # Evaluate the Decision Tree model with 10-fold cross-validation
    scores = cross_validate(dt, df_features, df_labels[label], cv=10)
    print("Cross validate scores:", scores)
# Entry point: run the decision-tree classification step and report how long it took.
if __name__ == "__main__":
    started = time.time()
    dt_classification(
        input_file='PCA10_data.csv',
        output_file='new_dt_data.csv',
        unit_index='Patient index',
        label='Label',
        max_depth=5,
        min_samples_leaf=5,
        min_samples_split=10,
        max_features='sqrt',
    )
    elapsed = time.time() - started
    print('Execution time: {:.2f}s'.format(elapsed))
C:\Users\Scort\AppData\Local\Temp\ipykernel_30896\3175021458.py:30: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_labels['Predicted Label'] = dt.predict(df_features)
<Figure size 1000x800 with 0 Axes>
Accuracy: 0.6202031602708804 Precision: 0.5754985754985755 Recall: 0.5186136071887034 F1 score: 0.5455773126266036 Cross validate scores: {'fit_time': array([0.00698066, 0.00698066, 0.00598407, 0.00706553, 0.00797963, 0.00897741, 0.0089767 , 0.00797939, 0.0079782 , 0.00897646]), 'score_time': array([0.00199509, 0.00299191, 0.00199485, 0.00199413, 0.00199413, 0.00299191, 0.00099564, 0.00199461, 0.00099802, 0.00199461]), 'test_score': array([0.53383459, 0.54511278, 0.54511278, 0.58646617, 0.53947368, 0.69924812, 0.61581921, 0.66101695, 0.48399247, 0.47457627])} Execution time: 0.50s
# Hyper-parameter sweep: for n = 1..100, use n as KNN's n_neighbors, SVC's C
# and the decision tree's max_depth, and record each model's mean 10-fold
# cross-validated accuracy.

# Load the PCA output and split it into feature columns and id/label columns
df = pd.read_csv('PCA10_data.csv')
df_features = df.drop(['Patient index', 'Label'], axis=1)
df_labels = df[['Patient index', 'Label']]

# Parameter values to test
n_range = range(1, 101)

# One list of mean accuracy scores per model, filled in n_range order
knn_mean_scores = []
dt_mean_scores = []
svm_mean_scores = []

for n in n_range:
    # Pair each freshly configured estimator with the list collecting its results
    candidates = (
        (KNeighborsClassifier(n_neighbors=n), knn_mean_scores),
        (SVC(C=n), svm_mean_scores),
        (DecisionTreeClassifier(max_depth=n), dt_mean_scores),
    )
    for estimator, collected in candidates:
        fold_scores = cross_val_score(estimator, df_features, df_labels['Label'],
                                      cv=10, scoring='accuracy')
        collected.append(fold_scores.mean())

# Plot the mean accuracy of each model against the parameter value
plt.figure(figsize=(16, 8))
curves = (
    (knn_mean_scores, 'KNN'),
    (svm_mean_scores, 'SVM'),
    (dt_mean_scores, 'Decision Tree'),
)
for mean_scores, legend_name in curves:
    plt.plot(n_range, mean_scores, label=legend_name)
plt.xlabel('Value of parameters (k, C, max_depth)')
plt.ylabel('Cross-validated accuracy')
plt.legend()
plt.show()
def knn_clustering(input_file, output_file, unit_index, k_clusters, label='Label'):
    """
    Cluster the feature columns with K-Means, export and plot the assignment.

    Note: despite its name, this function runs K-Means clustering (sklearn
    ``KMeans``), not a k-nearest-neighbours model.

    The features are all columns except ``unit_index`` and, when present,
    ``label``; the label is excluded so the unsupervised clustering is not
    driven by the known class. The input must contain 'PC1' and 'PC2'
    columns for the scatter plot.

    Side effects: writes ``output_file`` (unit index, 'Classification', then
    the feature columns), saves 'kmeans_plot.png' and shows the plot.

    :param input_file: str, input data file path
    :param output_file: str, output file path
    :param unit_index: str, column name of unit index
    :param k_clusters: int, number of clusters
    :param label: str or None, column name of label to exclude from the
        features; ignored when None or when the column is absent
    """
    # Read the input file
    df = pd.read_csv(input_file)
    # Keep only the feature columns: drop the unit index and the label
    df_features = df.drop([unit_index], axis=1)
    if label is not None and label in df_features.columns:
        df_features = df_features.drop([label], axis=1)
    # Fit KMeans model; n_init is set explicitly to keep the historical
    # default of 10 restarts and to silence sklearn's FutureWarning
    kmeans = KMeans(n_clusters=k_clusters, n_init=10, random_state=0).fit(df_features)
    # Add cluster assignments to the original dataframe
    df['Classification'] = kmeans.labels_
    # Export the unit index, the assignment and the features that were clustered
    df.to_csv(output_file, index=False,
              columns=[unit_index, 'Classification'] + list(df_features.columns))

    # Draw K-Means scatter plot with classifications
    plt.figure(figsize=(12, 9))
    colors = ['r', 'g', 'b', 'c', 'm']
    # With more clusters than fixed colours, let matplotlib pick colours
    # from its default cycle instead of indexing past the end of the list
    use_fixed_colors = k_clusters <= len(colors)
    for i in range(k_clusters):
        cluster = df[df['Classification'] == i]
        plt.scatter(cluster['PC1'], cluster['PC2'],
                    color=colors[i] if use_fixed_colors else None,
                    label='Type {}'.format(i + 1))
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('K-Means Clustering')
    plt.legend()
    plt.savefig('kmeans_plot.png')
    plt.show()
# Entry point: run the K-Means clustering step and report how long it took.
if __name__ == '__main__':
    started = time.time()
    knn_clustering(
        input_file='PCA10_data.csv',
        output_file='new_kmeans_data.csv',
        unit_index='Patient index',
        k_clusters=5,
    )
    elapsed = time.time() - started
    print('Execution time: {:.2f}s'.format(elapsed))
C:\Users\Scort\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn(
Execution time: 0.89s